Publicly available mouse and human datasets of pre/post gene expression in skeletal muscle were used to project mouse patterns onto human data using ProjectR package. The end goal is to find latent variables in mouse gene expression to project onto human data to make mouse data better recapitulate human data.
Built with R 4.0.5
Lab notebook
Wrike project
Data
Mouse Data: GSE97718
* 16 samples
* Control diet (n=8) high fat diet (n=8)
* Pre and post exercise samples collected from each mouse
* RNA isolated from quadricep skeletal muscle
* fastq’s downloaded from ENA
* data preprocessed with RNA-seq STAR pipeline by Lara Ianov
Human Data: GSE108643
* 59 samples
* Participants divided by BMI: lean ((BMI<25, 18.5- 24.1 kg/m2, n=15) and Ov/Ob (BMI≥25, 25.5- 36.9 kg/m2, n=15)
* Pre and post exercise samples collected from each participant
* RNA isolated from vastus lateralis muscle biopsies
* fastq’s downloaded from ENA
* data preprocessed with RNA-seq STAR pipeline by Lara Ianov
#Load in raw counts for Mouse pre/post exercise experiment
mouse_counts <- read.delim("/Users/eramsey/Desktop/R21_210302/PreliminaryR21/mouse210321_fixed_raw_counts.txt", header = TRUE, row.names = 1, sep = "\t", stringsAsFactors = FALSE)
mouse_counts <- mouse_counts[, -c(1:3)]
Make Mouse Metadata Table
Metadata/info table not supplied with this study. Using GEO, manually created (see here https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE97718)
#metadata not included with this dataset..
mouse_info <- read.csv("MouseSraRunTable.txt", header = TRUE, sep = ",")
Name <- c("CON_pre4","HFD_pre2","HFD_pre3","HFD_pre4","CON_post1","CON_post2","CON_post3","CON_post4","HFD_post4","CON_pre3","CON_pre2","HFD_post2","HFD_post3","HFD_pre1","CON_pre1","HFD_post1")
mouse_info <- cbind(mouse_info, Name)
#making columns specific to condition (diet and exercise)
mouse_info <- mutate(mouse_info, Diet = ifelse(grepl("HFD", Name, ignore.case = TRUE), "HFD", "CON"))
mouse_info <- mutate(mouse_info, Exercise = ifelse(grepl("pre", Name, ignore.case = TRUE), "pre", "post"))
mouse_info <- column_to_rownames(mouse_info, var = "Name")
#HUMAN data read in
human_data <- read.delim("/Users/eramsey/Desktop/R21_210302/PreliminaryR21/human_fixed_raw_counts.txt", header = TRUE, row.names = 1, sep = "\t")
human_data <- human_data[, -c(1:3)]
human_info <- read.csv("human_info.csv", header = TRUE, sep = ",")
human_info <- human_info[c(1:58),]
#make new label names for human samples
human_info_test <- unite(human_info, col = Name, BMI, PrePost, sep = "_", remove = FALSE)
##NEED TO FIGURE OUT how to add sequence of 1-3 for every 3 columns to give unique names
#human_info_test <- lapply(human_info_test$Name, c(1:3), paste())
#rep(1:14, 4) and then paste0
numbered_list <- c(1:14, 1:15, 1:14, 1:15)
names <- paste(human_info_test$Name, numbered_list, sep = "_")
human_info_test[["Names"]] <- names
human_info <- column_to_rownames(human_info_test, var = "Names")
DESeq2 was used to normalize data
mouse_dds <- DESeqDataSetFromMatrix(mouse_counts, colData = mouse_info, design = ~ Exercise)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
mouse_dds <- DESeq(mouse_dds)
#Contrast for log fold change, pre = numerator, post = denominator
mouse_res <- results(mouse_dds, contrast = c("Exercise", "post", "pre"))
write.csv(as.data.frame(mouse_res), file = "Mouse_DESeq2_results.csv")
mouse_vsd <- vst(mouse_dds, blind = FALSE)
mouse_deseq2_pca <- DESeq2::plotPCA(mouse_vsd, intgroup = "Exercise", )
mouse_deseq2_pca
DESeq2 was used to normalize data
#HUMAN normalization
human_dds <- DESeqDataSetFromMatrix(human_data, colData = human_info, design = ~ PrePost)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
human_dds <- DESeq(human_dds)
human_res <- results(human_dds, contrast = c("PrePost", "Post", "Pre"))
write.csv(human_res, file = "210518_Human_DESeq_Res.csv")
human_vsd <- vst(human_dds, blind = FALSE)
# trying to get lfc in order
human_de <- cbind(LFC = human_res$log2FoldChange, Pval = human_res$pvalue)
row.names(human_de) <- row.names(human_res)
human_de_df <- as.data.frame(human_de)
human_de_ordered <- human_de_df[order(-human_de_df$LFC), , drop = FALSE]
human_vsd_matrix <- assay(human_vsd)
human_deseq2_pca <- plotPCA(human_vsd, intgroup = "PrePost")
human_deseq2_pca
rownames(human_vsd_matrix) <- sub("\\..*", "", rownames(human_vsd_matrix))
#back to df and make ensembl gene rownames into a column
human_vsd_matrix <- human_vsd_matrix %>% as.data.frame() %>% rownames_to_column(., "ENSEMBL")
Mouse Gene ID Conversion
#preparing outputs for conversion
mouse_vsd_matrix <- assay(mouse_vsd)
rownames(mouse_vsd_matrix) <- sub("\\..*", "", rownames(mouse_vsd_matrix))
mouse_vsd2 <- mouse_vsd_matrix %>% as.data.frame() %>% rownames_to_column(., "ENSEMBL")
write.csv(mouse_info, file= "New_Mouse_Metadata")
AnnotationDbi for Mouse Symbols
#annotation
library(org.Mm.eg.db)
#retrieve conversion info from one ID type to another
#REPLACE test_mouse to mouse_anno
mouse_anno <- AnnotationDbi::select(org.Mm.eg.db, keys = rownames(mouse_vsd_matrix), columns = c("SYMBOL", "GO"),keytype = "ENSEMBL")
#determine indices for non-NA genes
mousenon_na_symbols <- which(is.na(mouse_anno$SYMBOL) == FALSE)
#return only the genes with annotations using indices
mouse_anno <- mouse_anno[mousenon_na_symbols, ]
#determine indices for non-duplicated genes
mouseno_dups_symbols <- which(duplicated(mouse_anno$SYMBOL) == FALSE)
#return only non-dup genes using indices
mouse_anno <- mouse_anno[mouseno_dups_symbols, ]
#add symbols to normalized mouse data
mouse_symbol <- inner_join(mouse_anno, mouse_vsd2, by = "ENSEMBL")
#has GO annotation as well as the other gene ID info
mouse_symbol_GO <- column_to_rownames(mouse_symbol, var = "SYMBOL")
#removing annotations, symbols for row names, format for analysis
mouse_symbol <- mouse_symbol_GO[,c(5:20)]
AnnotationDbi for Human Symbols
#annotation
#retrieve conversion info from one ID type to another
symbols_human <- AnnotationDbi::select(org.Hs.eg.db, keys = human_vsd_matrix$ENSEMBL, columns = c("SYMBOL", "GO"),keytype = "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
#determine indices for non-NA genes
non_na <- which(is.na(symbols_human$SYMBOL) == FALSE)
#return only the genes with annotations using indices
symbols_human <- symbols_human[non_na, ]
#determine indices for non-duplicated genes
no_dups_human <- which(duplicated(symbols_human$SYMBOL) == FALSE)
#return only non-dup genes using indices
symbols_human <- symbols_human[no_dups_human, ]
human_symbol_data <- inner_join(symbols_human, human_vsd_matrix, by = "ENSEMBL")
#has GO annotation as well as the other gene ID info
human_symbol_data_GO <- column_to_rownames(human_symbol_data, var = "SYMBOL")
#removing annotations, symbols for row names, format for analysis
human_symbol_data <- human_symbol_data_GO[,5:62]
To be able to compare gene expression data from mouse to human, must convert first to orthologous genes
## Basic function to convert mouse to human gene names
convertMouseGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = x , mart = mouse, attributesL = c("hgnc_symbol"), martL = human, uniqueRows=T)
return(genesV2)
}
#Use convertMouseGeneList to convert to human genes
mouse_to_human_genes <- convertMouseGeneList(mouse_anno$SYMBOL)
## Ensembl site unresponsive, trying useast mirror
## Ensembl site unresponsive, trying useast mirror
conv_mouse <- mouse_symbol %>% rownames_to_column(., var = "MGI.symbol") %>% left_join(., mouse_to_human_genes, by = "MGI.symbol")
#determine non-NA genes
non_na_mouse <- which(is.na(conv_mouse$HGNC.symbol) == FALSE)
#return only the genes with annotations using indices
conv_mouse <- conv_mouse[non_na_mouse, ]
#determine indices for non-duplicated genes
no_dups_mouse <- which(duplicated(conv_mouse$HGNC.symbol) == FALSE)
#return only non-dup genes using indices
conv_mouse <- conv_mouse[no_dups_mouse, ]
rownames(conv_mouse) <- NULL
conv_mouse <- conv_mouse %>% as.data.frame() %>% column_to_rownames(., var = "HGNC.symbol")
conv_mouse <- conv_mouse[,c(2:17)]
How many patterns? Might need to finish FEA to figure out.
#MOUSE NMF
#parameters for GoGAPS
params <- new("CogapsParams")
params <- setParam(params, "seed", 1000)
params <- setParam(params, "nPatterns", 9)
#CoGAPS to find patterns in the data
#set parameters for number of patterns
AP_mouse <- CoGAPS(conv_mouse, params, nIterations = 5000)
##
## This is CoGAPS version 3.10.0
## Running Standard CoGAPS on conv_mouse (18200 genes and 16 samples) with parameters:
##
## -- Standard Parameters --
## nPatterns 9
## nIterations 5000
## seed 1000
## sparseOptimization FALSE
##
## -- Sparsity Parameters --
## alpha 0.01
## maxGibbsMass 100
getMeanChiSq(AP_mouse)
## [1] 33413.22
For Loop for Chi Square Matrix from CoGAPS Runs
#CoGAPS to find patterns in the data
AP_human <- CoGAPS(human_symbol_data, params, nIterations = 5000)
##
## This is CoGAPS version 3.10.0
## Running Standard CoGAPS on human_symbol_data (35170 genes and 58 samples) with parameters:
##
## -- Standard Parameters --
## nPatterns 9
## nIterations 5000
## seed 1000
## sparseOptimization FALSE
##
## -- Sparsity Parameters --
## alpha 0.01
## maxGibbsMass 100
getMeanChiSq(AP_human)
## [1] 523937
plot(AP_human)
Boxplot for Patterns in Mouse
Mouse Pattern Violin Plots
mouse_featureLoadings <- AP_mouse@featureLoadings %>% as.data.frame()
boxplot_loadings <- boxplot(mouse_featureLoadings, xlab = "NMF Patterns", ylab = "Amplitude", las = 2, main = "Mouse Feature Loadings")
Base R Boxplot for Patterns in Human
human_samplefactors <- AP_human@sampleFactors %>% as.data.frame()
#%>% rownames_to_column(., var = "Sample")
human_samplefactors <- cbind(human_samplefactors, Exercise = human_info$PrePost)
#split into two matrices, pre and post, and then use add to "add boxplot to current plot"
#group pre/post exercise into two different matrices, and then combine them into a boxplot
pre_exercise_human <- dplyr::filter(human_samplefactors, Exercise == "Pre")
pre_exercise_human <- subset(pre_exercise_human, select = -Exercise)
post_exercise_human <- dplyr::filter(human_samplefactors, Exercise == "Post")
post_exercise_human <- subset(post_exercise_human, select = -Exercise)
#base r boxplot
boxplot_factors_post_human <- boxplot(post_exercise_human, las = 2, ylab = "Sample Factors", main = "Human Post Exercise Patterns", col = "powderblue")
stripchart(post_exercise_human, vertical = TRUE, method = "jitter", add = TRUE, pch = 20, col = 'black') #add data points
boxplot_factors_pre_human <- boxplot(pre_exercise_human, las = 2, ylab = "Sample Factors", main = "Human Pre Exercise Patterns", col = "plum2")
stripchart(pre_exercise_human, vertical = TRUE, method = "jitter", add = TRUE, pch = 20, col = 'black')
ggPlot Boxplot for Patterns in Human
human_sampleFactors <- AP_human@sampleFactors
human_sampleFactors_long <- human_sampleFactors %>% as.data.frame() %>% rownames_to_column(., var = "Sample") %>% pivot_longer( cols = -Sample, names_to = "Patterns", values_to = "Factors") %>% mutate(Exercise = ifelse(grepl("Pre", Sample, ignore.case = TRUE), "Pre", "Post")) %>% unite( col = "Exercise_Pattern", Patterns, Exercise, sep = "_", remove = FALSE)
human_NMF_boxplot <- ggplot(human_sampleFactors_long, aes(x = Exercise_Pattern, y = Factors)) +
geom_boxplot(color = "gray60") +
geom_point(aes(color = Exercise)) +
geom_jitter(size = 2, alpha = 0.25, width = 0.2) +
scale_colour_viridis_d() +
labs(x = "NMF Patterns", y = "Sample Weights") +
theme(panel.grid = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 16), axis.title = element_text(size = 16), title = element_text(size = 18), legend.text = element_text(size = 16)) +
theme(plot.margin = unit(c(1,1,1.5,1.2),"cm")) + #top, right, bottom, left
labs(title = "NMF Patterns in Pre/Post Exercise Samples")
human_NMF_boxplot
Heatmap set up the same as ProjectR vignette, using hierachical clustering
NMF_mouse <-heatmap.2(as.matrix(AP_mouse),col=viridis(100), trace='none',
distfun=function(c) as.dist(1-cor(t(c))) ,
cexCol=1,cexRow=.6,scale = "row", main = "Mouse NMF", labRow = rownames(conv_mouse), xlab = "Patterns",
hclustfun=function(x) hclust(x, method="average"))
Mouse NMF ggPlot Heatmap - Data Wrangling
mouse_featloadings <- AP_mouse@featureLoadings
#estimate variance for each row
var_mouse_featloadings <- apply(mouse_featloadings, 1, var)
#top 100 most variable genes
top100_var_mouse_featloadings_names <- names(sort(var_mouse_featloadings, decreasing = TRUE))[1:100]
head(top100_var_mouse_featloadings_names)
## [1] "UBE2C" "RABEP2" "TLCD3B" "TSTD2" "POLR3H" "CCDC127"
Mouse - Highest feature/gene weight averages
#to look at the highest weighted genes, going to select genes with highest averages across patterns
avg_mouse_featloadings <- apply(mouse_featloadings, 1, mean)
top75_mouse_featloadings_names <- names(sort(avg_mouse_featloadings, decreasing = TRUE))[1:75]
head(top75_mouse_featloadings_names)
## [1] "RABEP2" "UBE2C" "TLCD3B" "POLR3H" "MAP3K15" "TSTD2"
Mouse data wrangling
#data wrangling
top75_mouse_featloadings <- mouse_featloadings[top75_mouse_featloadings_names,]
top75_mouse_featloadings <- as.data.frame(top75_mouse_featloadings)
top75_mouse_featloadings <- rownames_to_column(top75_mouse_featloadings, var = "genes")
dim(top75_mouse_featloadings)
## [1] 75 10
head(top75_mouse_featloadings)
## genes Pattern_1 Pattern_2 Pattern_3 Pattern_4 Pattern_5 Pattern_6
## 1 RABEP2 0.01983772 2.230480 2.168315 2.404643 2.048885 1.338395
## 2 UBE2C 0.05603908 1.818706 1.650107 2.536131 2.197390 1.658526
## 3 TLCD3B 0.05460929 2.058435 1.727842 1.579264 1.782673 1.802371
## 4 POLR3H 0.03003079 1.273096 1.445761 1.613505 1.472139 1.207789
## 5 MAP3K15 0.06058228 1.170291 1.451192 1.810609 1.619943 1.126586
## 6 TSTD2 0.06705469 1.428910 1.599869 1.297245 1.245230 1.212494
## Pattern_7 Pattern_8 Pattern_9
## 1 2.414575 0.7157919 2.704867
## 2 2.030578 0.4868053 3.071728
## 3 2.551762 0.3860395 1.918536
## 4 1.695494 0.3991383 2.013255
## 5 1.330572 0.5106807 1.855779
## 6 1.202255 0.2561178 2.162539
Mouse Heatmap
https://jcoliver.github.io/learn-r/009-expression-heatmaps.html
mouse_featloadings_long <- pivot_longer(data = top75_mouse_featloadings,
cols = -genes,
names_to = "pattern",
values_to = "feat_weight")
head(mouse_featloadings_long)
## # A tibble: 6 x 3
## genes pattern feat_weight
## <chr> <chr> <dbl>
## 1 RABEP2 Pattern_1 0.0198
## 2 RABEP2 Pattern_2 2.23
## 3 RABEP2 Pattern_3 2.17
## 4 RABEP2 Pattern_4 2.40
## 5 RABEP2 Pattern_5 2.05
## 6 RABEP2 Pattern_6 1.34
mouse_featloadings_long$log.weight <- log(mouse_featloadings_long$feat_weight)
Heatmap visualization of Amplitude/features/gene weight matrix (CoGAPS output)
mouse_featloadings_heatmap <- ggplot(mouse_featloadings_long, mapping = aes(x = pattern, y = genes, fill = log.weight)) + geom_tile() + xlab(label = "Patterns") + ylab(label = "Genes") + theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
mouse_featloadings_heatmap
ggsave("210618_NMF_mouse_featloadings_heatmap.pdf", mouse_featloadings_heatmap)
Mouse Gene/Feature NMF Matrix Heatmap in ComplexHeatmap
Heatmap visualization of Amplitude/features/gene weight matrix (CoGAPS output)
##ggplot Heatmap
human_featloadings <- AP_human@featureLoadings
#estimate variance for each row
var_human_featloadings <- apply(human_featloadings, 1, var)
#top 100 most variable genes
top20_var_human_featloadings_names <- names(sort(var_human_featloadings, decreasing = TRUE))[1:20]
head(top20_var_human_featloadings_names)
## [1] "TTN-AS1" "LOC101927055" "ND6" "RIF1" "TMEM30A-DT"
## [6] "ATP2A1-AS1"
top20_var_human_featloadings_matrix <- human_featloadings[top20_var_human_featloadings_names,]
Human NMF Heatmap
Clustered heatmap of human NMF feature/amplitude/gene matrix, selecting for top 20 gene weights with greatest variance amongst patterns
#using wide-form of data to cluster
#top20_human_matrix <- as.numeric(top20_human_featloadings)
#png(file = "/Users/eramsey/Desktop/R21_210302/PreliminaryR21/210706_Human_NMF_Variance_GeneWeights.png")
heatmap.2(top20_var_human_featloadings_matrix,col=viridis, trace='none', margin=c(11,11),
distfun=function(c) as.dist(1-cor(t(c))) , xlab = "NMF Patterns",
srtCol = 45, cexCol=2,cexRow=1.7, scale = "row",
hclustfun=function(x) hclust(x, method="average"), main = "Most Variable Genes Across NMF Patterns", cex.main = 6
)
#dev.off()
Human - highest average weighted genes
#to look at the highest weighted genes, going to select genes with highest averages across patterns
avg_human_featloadings <- apply(human_featloadings, 1, mean)
top20_human_featloadings_names <- names(sort(avg_human_featloadings, decreasing = TRUE))[1:20]
head(top20_human_featloadings_names)
## [1] "LOC101927055" "TTN-AS1" "ND6" "RIF1" "ND5"
## [6] "LOC105377590"
Human - data wrangling
#data wrangling
top20_human_featloadings_matrix <- human_featloadings[top20_human_featloadings_names,]
top20_human_featloadings <- as.data.frame(top20_human_featloadings_matrix)
top20_human_featloadings <- rownames_to_column(top20_human_featloadings, var = "genes")
dim(top20_human_featloadings)
## [1] 20 10
head(top20_human_featloadings)
## genes Pattern_1 Pattern_2 Pattern_3 Pattern_4 Pattern_5 Pattern_6
## 1 LOC101927055 0.9434009 0.06260934 0.3319990 5.652393 0.2224951 0.4111889
## 2 TTN-AS1 0.4823115 0.08893458 0.2780286 6.151776 0.1403035 0.1835457
## 3 ND6 0.4840103 0.07370391 0.2058961 6.440374 0.1127677 0.3561757
## 4 RIF1 0.9708761 0.05942313 0.1315845 5.017206 0.0838132 0.2022305
## 5 ND5 1.4101026 0.11987590 0.3140898 3.539114 0.1579720 0.3508668
## 6 LOC105377590 0.7213866 0.07830001 0.3563071 4.129803 0.1790764 0.3498320
## Pattern_7 Pattern_8 Pattern_9
## 1 6.652137 0.2572868 0.07669174
## 2 6.941866 0.2912224 0.04211363
## 3 5.250072 0.2038665 0.04429376
## 4 4.866413 0.0732412 0.04299175
## 5 4.614510 0.2301526 0.02281302
## 6 4.097141 0.2954546 0.05812084
Human - data wrangling
https://jcoliver.github.io/learn-r/009-expression-heatmaps.html
human_featloadings_long <- pivot_longer(data = top20_human_featloadings,
cols = -genes,
names_to = "pattern",
values_to = "feat_weight")
head(human_featloadings_long)
## # A tibble: 6 x 3
## genes pattern feat_weight
## <chr> <chr> <dbl>
## 1 LOC101927055 Pattern_1 0.943
## 2 LOC101927055 Pattern_2 0.0626
## 3 LOC101927055 Pattern_3 0.332
## 4 LOC101927055 Pattern_4 5.65
## 5 LOC101927055 Pattern_5 0.222
## 6 LOC101927055 Pattern_6 0.411
human_featloadings_long$log.weight <- log(human_featloadings_long$feat_weight)
head(human_featloadings_long)
## # A tibble: 6 x 4
## genes pattern feat_weight log.weight
## <chr> <chr> <dbl> <dbl>
## 1 LOC101927055 Pattern_1 0.943 -0.0583
## 2 LOC101927055 Pattern_2 0.0626 -2.77
## 3 LOC101927055 Pattern_3 0.332 -1.10
## 4 LOC101927055 Pattern_4 5.65 1.73
## 5 LOC101927055 Pattern_5 0.222 -1.50
## 6 LOC101927055 Pattern_6 0.411 -0.889
Human NMF Heatmap Human Gene/Feature NMF Matrix Heatmap in ggplot, selecting for top highest weightest averages for genes amonst all patterns
human_featloadings_heatmap <- ggplot(human_featloadings_long, mapping = aes(x = pattern, y = genes, fill = log.weight)) +
geom_tile() +
labs(title = "Human NMF Feature Weights") +
xlab(label = "Patterns") +
ylab(label = "Top Weighted Genes") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5), axis.text.y = element_text(size = 5)) +
scale_fill_viridis(option = "plasma")
human_featloadings_heatmap
The mouse NMF patterns can then be projected onto the human data
Scatterplot to compare patterns
human_data <- as.matrix(human_symbol_data)
human_NMF <- projectR(human_data,loadings=AP_mouse, full=TRUE,
dataNames=rownames(human_data))
## [1] "18108 row names matched between data and loadings"
## [1] "Updated dimension of data: 18108 58"
#pval_NMF <- human_NMF$pval
projection_NMF <- human_NMF$projection %>% t() %>% cbind(., human_info)
#plot scatterplot
projection_plot <- ggplot(projection_NMF, aes(x = Pattern_3, y = Pattern_9, colour = PrePost)) + geom_point() + ggtitle("Mouse Data NMF Patterns Projected onto Human Data")
projection_plot
#ggsave("NMF_Mouse_Projected_to_Human_14pat5000it.pdf", projection_plot)
ggsave("210628_Scatterplot_Projected_MousetoHuman_Patterns3and9.pdf", projection_plot)
## Saving 7 x 5 in image
PCA to Compare Projected Patterns
#PCA
projection_NMF_mouse2human_pca <- prcomp(human_NMF$projection)
projection_NMF_mouse2human_pca_plot <- ggplot(as.data.frame(projection_NMF_mouse2human_pca$x), aes(x=PC1,y=PC2, colour = "red", "blue")) +geom_point()
#projection_NMF_mouse2human_pca_plot
pc_human <- prcomp(t(human_symbol_data))
pc_mouse <- prcomp(t(conv_mouse))
#find variance
pc_var_mouse <- round(((pc_mouse$sdev)^2/sum(pc_mouse$sdev^2))*100,2)
pca_mouse_df <- data.frame(cbind(pc_mouse$x, mouse_info))
Boxplot to Compare Pre/Post Projected Patterns
NMF_projection_long <- projection_NMF %>% rownames_to_column(., var = "Sample") %>% pivot_longer(starts_with("Pattern_"), names_to = "Patterns", values_to = "Factors") %>% unite( col = "Exercise_Pattern", Patterns, PrePost, sep = "_", remove = FALSE)
projection_NMF_boxplot <- ggplot(NMF_projection_long, aes(x = Exercise_Pattern, y = Factors)) +
geom_boxplot(color = "gray60") +
geom_point(aes(color = PrePost)) +
geom_jitter(size = 2, alpha = 0.25, width = 0.2) +
scale_colour_viridis_d() +
labs(x = "Projected Pre and Post Exercise Patterns", y = "Factors") +
theme(panel.grid = element_blank(), axis.text.x = element_text(angle = 45, vjust = 0.5)) +
labs(title = "Projected Mouse NMF Pre/Post Exercise Patterns onto Human Data")
projection_NMF_boxplot
Scatterplot to Compare Pre/Post Projected Patterns
projection_NMF_scatterplot <- ggplot(NMF_projection_long, aes(x = Patterns, y = Factors)) +
geom_point(aes(color = PrePost)) +
geom_jitter(size = 2, alpha = 0.5, width = 0.2) +
scale_colour_viridis_d() +
labs(x = "Projected Pre and Post Exercise Patterns", y = "Factors") +
theme(panel.grid = element_blank(), axis.text.x = element_text(angle = 45, vjust = 0.5)) +
labs(title = "Projected Mouse NMF Pre/Post Exercise Patterns onto Human Data")
projection_NMF_scatterplot
mouse_NMF <- projectR(as.matrix(conv_mouse),loadings=AP_human, full=TRUE,
dataNames=rownames(conv_mouse))
## [1] "18108 row names matched between data and loadings"
## [1] "Updated dimension of data: 18108 16"
projection_NMF_human2mouse <- mouse_NMF$projection %>% t() %>% cbind(., mouse_info)
#plot scatterplot
projection_plot_human2mouse <- ggplot(projection_NMF_human2mouse, aes(x = Pattern_6, y = Pattern_9, colour = Exercise)) + geom_point() + ggtitle("Mouse Data NMF Patterns Projected onto Human Data")
projection_plot_human2mouse
Another ProjectR method to projections. Principal components are found for mouse and human data. Mouse PC’s are used for loadings and projected onto human data.
Make PC’s
pc_human <- prcomp(t(human_symbol_data))
pc_mouse <- prcomp(t(conv_mouse))
#find variance
pc_var_mouse <- round(((pc_mouse$sdev)^2/sum(pc_mouse$sdev^2))*100,2)
pca_mouse_df <- data.frame(cbind(pc_mouse$x, mouse_info))
Project PC’s
PCA_projectr <- projectR(human_data, loadings = pc_mouse, full = TRUE, dataNames = rownames(human_data))
## [1] "18108 row names matched between data and loadings"
## [1] "Updated dimension of data: 18108 58"
human_meta <- column_to_rownames(human_info_test, var = "SRR")
human_meta_t <- t(human_meta)
PCA_projectr_t <- t(PCA_projectr[[1]])
PCA_projectr_df <- cbind(PCA_projectr_t, human_meta)
#[1] "18282 row names matched between data and loadings"
#[1] "Updated dimension of data: 18282 58"
#Plot PCA
dPCA <- data.frame(cbind(t(PCA_projectr[[1]]),PCA_projectr_df))
projected_PCA <- ggplot(PCA_projectr_df, aes(x = PC1, y = PC2, colour = PrePost)) +
geom_point(size = 3.5) +
scale_color_viridis_d() +
theme(panel.grid = element_blank(), axis.text = element_text(size = 16), axis.title = element_text(size = 16), title = element_text(size = 18), legend.text = element_text(size = 16)) +
ggtitle("Mouse Principal Components \nProjected onto Human Data")
projected_PCA
There’s 2 methods: cluster2pattern finds correlation of each gene’s expression by mean of cluster to define continuous weights. Then intersector tests significant overlap between 2 clustering objects. For these purposes, I think cluster2pattern is better for finding what we want (differences between clusters rather than overlaps). Requires a clustering object first.
#TOP 50 DIFFERENTIALLY EXPRESSED GENES
#checking standard deviation for each gene
stdevs <- apply(conv_mouse, 1, sd)
mouse_stdev <- cbind(conv_mouse, stdevs)
mouse_stdev <- slice_max(mouse_stdev, order_by = stdevs, n = 250)
mouse_top250 <- mouse_stdev[,-17]
write.csv2(mouse_top250, file = "Mouse_Top250_DEG.csv")
library(fpc)
#pamk: partitioning around medoids clustering with the number of clusters estimated by optimum average silhouette width
#part_clust_mouse <- pamk(stdevs, scaling = TRUE)
#plot(part_clust_mouse$pamobject)
# Hierarchical Clustering
mouse_dist <- dist(mouse_top250, method = "euclidean") # distance matrix
hclust_fit <- hclust(mouse_dist, method="average")
# Hierarchical Clustering with Bootstrapped p values
library(pvclust)
#pvclust() provides p-values for hierarchical clustering based on multiscale bootstrap resampling-clusters that are highly supported by the data will have large p values.
#pvclust clusters columns, not rows: transpose your data before using
fit <- pvclust(mouse_top250, method.hclust="average",
method.dist="euclidean")
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
hclust_dendro_sig <- plot(fit)
pvrect(fit, alpha=.90) # add rectangles around groups highly supported by the data
Clustering in Human Data
Hierarchical Clustering with Bootstrapped p values
pvclust() provides p-values for hierarchical clustering based on multiscale bootstrap resampling-clusters that are highly supported by the data will have large p values. (Note: pvclust clusters columns, not rows: transpose your data before using)
#TOP 50 DIFFERENTIALLY EXPRESSED GENES
human_stdevs <- apply(human_symbol_data, 1, sd)
human_stdev <- cbind(human_symbol_data, human_stdevs)
human_stdev <- slice_max(human_stdev, order_by = human_stdevs, n = 50)
human_top250 <- human_stdev[,-17]
write.csv2(human_top250, file = "Human_Top50_DEG.csv")
fit_human <- pvclust(human_top250, method.hclust="average",
method.dist="euclidean")
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
human_hclust_dendro_sig <- plot(fit_human)
#pvrect(fit, alpha=.90) # add rectangles around groups highly supported by the data
Output of the cluster2pattern function is a pclust class object; specifically, a matrix of genes (rows) by clusters (columns). A gene’s value outside of its assigned cluster is zero. For the cluster containing a given gene, the gene’s value is the correlation of the gene’s expression to the mean of that cluster.
mouse_clusters <- cluster2pattern(hclust_fit, NP = 2, Data = mouse_top250)
Projecting mouse clusters onto human data
#need to use the top 50 genes from mouse DE to pull those genes from human and then project cluster object from cluster2pattern onto human data
top50_list <- rownames_to_column(mouse_top250, var = "SYMBOL")
human_list <- rownames_to_column(human_symbol_data_GO, var = "SYMBOL")
top50_list <- semi_join(human_list, top50_list, by = "SYMBOL")
mouseToHumanTop50 <- column_to_rownames(top50_list, var = "SYMBOL")
mouseToHumanTop50 <- mouseToHumanTop50[,5:62]
mouseToHumanTop50 <- as.matrix(mouseToHumanTop50)
cluster_projection <- projectR(mouseToHumanTop50, loadings = mouse_clusters, full = TRUE, dataNames = rownames(mouseToHumanTop50))
## [1] "250 row names matched between data and loadings"
## [1] "Updated dimension of data: 250 58"
#combining projection with metadata and preparing for PCA
cluster_projection_t <- t(cluster_projection[[1]])
cluster_projection_df <- cbind(cluster_projection_t, human_meta)
cluster_projection_plot <- ggplot(cluster_projection_df, aes(x = x1, y = x2, colour = PrePost)) + geom_point() + ggtitle("Mouse Clusters Projected onto Human Data")
cluster_projection_plot
ggsave("Mouse_Clusters_Projection.pdf", cluster_projection_plot)
## Saving 7 x 5 in image
Mouse ggprofiler https://biit.cs.ut.ee/gprofiler/page/r
g:Profiler uses all the genes annotated to that term as an input (in this case about six hundred human genes associated to heart development). Fully numeric identifiers need to be prefixed with the corresponding namespace. g:Profiler will automatically prefix all the detected numeric IDs using the prefix determined by the selected numeric namespace parameter.
gost enables to perform functional profiling of gene lists. The function performs statistical enrichment analysis to find over-representation of functions from Gene Ontology, biological pathways like KEGG and Reactome, human disease annotations, etc. This is done with the hypergeometric test followed by correction for multiple testing.
In order to reduce the amount of false positives, a multiple testing correction method is applied to the enrichment p-values. By default, our tailor-made algorithm g:SCS is used (correction_method = “gSCS” with synonyms g_SCS and analytical), but there are also options to apply the Bonferroni correction (correction_method = “bonferroni”) or FDR (correction_method = “fdr”). The adjusted p-values are reported in the results.
Human FEA using gprofiler
#input needs to be a dataframe with genes and p values, so each pattern will need its own dataframe
#sample_factors has already been extracted from AP_human, need to do the same with the stdev
human_feature <- as.data.frame(AP_human@featureLoadings)
human_feature_stdev <- as.data.frame(AP_human@loadingStdDev)
pattern5_human <- dplyr::select(human_feature, Pattern_5) %>% cbind(., StdDev = human_feature_stdev$Pattern_5)
head(pattern5_human)
## Pattern_5 StdDev
## DDX11L2 0.08131652 0.1860833
## DDX11L16 0.03840808 0.1213581
## DDX11L1 0.09497906 0.2578371
## DDX11L5 0.03780625 0.1084210
## DDX11L17 0.15602939 0.2845393
## WASH7P 0.11989398 0.2513702
pattern1_human <- dplyr::select(human_feature, Pattern_1) %>% cbind(., StdDev = human_feature_stdev$Pattern_1)
head(pattern1_human)
## Pattern_1 StdDev
## DDX11L2 1.1809987 0.8131181
## DDX11L16 0.2364825 0.4237808
## DDX11L1 0.1353610 0.2749353
## DDX11L5 0.4262073 0.6837504
## DDX11L17 0.5320511 0.5834329
## WASH7P 1.0691203 0.6250860
#want to look at just the significant genes of pattern 5
pattern5_sig <- pattern5_human[pattern5_human$StdDev < 0.1,]
pattern1_sig <- pattern1_human[pattern1_human$StdDev < 0.1,]
pattern5_human_names <- rownames(pattern5_sig) %>% as.list()
names(pattern5_human_names) <- rownames(pattern5_sig)
Subsetting human data differently
human_feature_df <- rownames_to_column(human_feature, var = "Genes")
human_nmf_geneweight_long <- pivot_longer(data = human_feature_df,
cols = -Genes,
names_to = "Pattern",
values_to = "Gene_weight")
head(human_nmf_geneweight_long)
## # A tibble: 6 x 3
## Genes Pattern Gene_weight
## <chr> <chr> <dbl>
## 1 DDX11L2 Pattern_1 1.18
## 2 DDX11L2 Pattern_2 0.142
## 3 DDX11L2 Pattern_3 0.309
## 4 DDX11L2 Pattern_4 0.0963
## 5 DDX11L2 Pattern_5 0.0813
## 6 DDX11L2 Pattern_6 0.278
human_pattern2 <- human_nmf_geneweight_long %>% dplyr::filter( Pattern == "Pattern_2") %>% slice_max(order_by = Gene_weight, n = 150)
human_pattern5 <- human_nmf_geneweight_long %>% dplyr::filter( Pattern == "Pattern_5") %>% slice_max(order_by = Gene_weight, n = 150)
human_pattern9 <- human_nmf_geneweight_long %>% dplyr::filter( Pattern == "Pattern_9") %>% slice_max(order_by = Gene_weight, n = 150)
human_pattern2_names <- as.list(human_pattern2$Genes)
names(human_pattern2_names) <- human_pattern2$Genes
human_pattern5_names <- as.list(human_pattern5$Genes)
names(human_pattern5_names) <- human_pattern5$Genes
human_pattern9_names <- as.list(human_pattern9$Genes)
names(human_pattern9_names) <- human_pattern9$Genes
Human gprofiler gost() runs
#look into exclude_iea and correction_methods
gost_res_2 <- gost(query = human_pattern2_names, organism = "hsapiens", ordered_query = FALSE, significant = TRUE, exclude_iea = FALSE, measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = c("REAC", "KEGG", "CORUM"), as_short_link = FALSE)
head(gost_res_2$result)
## query significant p_value term_size query_size intersection_size
## 1 ATP1A2 TRUE 0.04423077 23 1 1
## 2 ATP2A1 TRUE 0.04994746 2 1 1
## 3 CANX TRUE 0.02497194 2 1 1
## 4 CANX TRUE 0.02497194 2 1 1
## 5 GOT2 TRUE 0.01153846 6 1 1
## 6 GOT2 TRUE 0.03269231 17 1 1
## precision recall term_id source
## 1 1 0.04347826 KEGG:04964 KEGG
## 2 1 0.50000000 CORUM:6514 CORUM
## 3 1 0.50000000 REAC:R-HSA-168268 REAC
## 4 1 0.50000000 REAC:R-HSA-168316 REAC
## 5 1 0.16666667 KEGG:00400 KEGG
## 6 1 0.05882353 KEGG:00360 KEGG
## term_name effective_domain_size
## 1 Proximal tubule bicarbonate reclamation 8000
## 2 SLN-ATP2A1 complex 3627
## 3 Virus Assembly and Release 10622
## 4 Assembly of Viral Components at the Budding Site 10622
## 5 Phenylalanine, tyrosine and tryptophan biosynthesis 8000
## 6 Phenylalanine metabolism 8000
## source_order parents
## 1 377 KEGG:00000
## 2 2189 CORUM:0000000
## 3 2425 REAC:R-HSA-168255
## 4 164 REAC:R-HSA-168268
## 5 51 KEGG:00000
## 6 44 KEGG:00000
gost_res_5 <- gost(query = human_pattern5_names, organism = "hsapiens", ordered_query = FALSE, significant = TRUE, exclude_iea = FALSE, measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = c("REAC", "KEGG", "CORUM"), as_short_link = FALSE)
head(gost_res_5$result)
## query significant p_value term_size query_size intersection_size
## 1 ATP1A4 TRUE 0.04423077 23 1 1
## 2 COQ6 TRUE 0.02115385 11 1 1
## 3 DTNA TRUE 0.04994746 2 1 1
## 4 FXR1 TRUE 0.04994746 2 1 1
## 5 HSP90AA1 TRUE 0.04994746 2 1 1
## 6 HSP90AA1 TRUE 0.04994746 2 1 1
## precision recall term_id source
## 1 1 0.04347826 KEGG:04964 KEGG
## 2 1 0.09090909 KEGG:00130 KEGG
## 3 1 0.50000000 CORUM:7341 CORUM
## 4 1 0.50000000 CORUM:5360 CORUM
## 5 1 0.50000000 CORUM:4158 CORUM
## 6 1 0.50000000 CORUM:5716 CORUM
## term_name effective_domain_size
## 1 Proximal tubule bicarbonate reclamation 8000
## 2 Ubiquinone and other terpenoid-quinone biosynthesis 8000
## 3 CTNNAL1-DNTA complex 3627
## 4 AGO2-FXR1-TNF(alpha)ARE-RNP complex 3627
## 5 HSP90-FKBP38-CAM-Ca(2+) complex 3627
## 6 NOS3-HSP90 complex, VEGF induced 3627
## source_order parents
## 1 377 KEGG:00000
## 2 17 KEGG:00000
## 3 2781 CORUM:0000000
## 4 1646 CORUM:0000000
## 5 1539 CORUM:0000000
## 6 1782 CORUM:0000000
gost_res_9 <- gost(query = human_pattern9_names, organism = "hsapiens", ordered_query = FALSE, significant = TRUE, exclude_iea = FALSE, measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = c("REAC", "KEGG", "CORUM"), as_short_link = FALSE)
head(gost_res_9$result)
## query significant p_value term_size query_size intersection_size
## 1 ATXN1 TRUE 0.04994746 2 1 1
## 2 CYP27A1 TRUE 0.03269231 17 1 1
## 3 CYP27A1 TRUE 0.01248597 1 1 1
## 4 ESR2 TRUE 0.04994746 2 1 1
## 5 FUT2 TRUE 0.02884615 15 1 1
## 6 FUT2 TRUE 0.02497194 2 1 1
## precision recall term_id source
## 1 1 0.50000000 CORUM:7236 CORUM
## 2 1 0.05882353 KEGG:00120 KEGG
## 3 1 1.00000000 REAC:R-HSA-5578996 REAC
## 4 1 0.50000000 CORUM:6087 CORUM
## 5 1 0.06666667 KEGG:00603 KEGG
## 6 1 0.50000000 REAC:R-HSA-9033807 REAC
## term_name
## 1 ARXN1-CIC complex
## 2 Primary bile acid biosynthesis
## 3 Defective CYP27A1 causes CTX
## 4 ERbeta-AR complex
## 5 Glycosphingolipid biosynthesis - globo and isoglobo series
## 6 ABO blood group biosynthesis
## effective_domain_size source_order parents
## 1 3627 2711 CORUM:0000000
## 2 8000 15 KEGG:00000
## 3 10622 489 REAC:R-HSA-5579029
## 4 3627 1880 CORUM:0000000
## 5 8000 99 KEGG:00000
## 6 10622 7 REAC:R-HSA-9033658
Data wrangling for human alluvial point
#need to add an identifier for pattern number for each dataframe
gost_pattern2 <- cbind(gost_res_2$result, pattern = c(rep("pattern_2")))
gost_pattern5 <- cbind(gost_res_5$result, pattern = c(rep("pattern_5")))
gost_pattern9 <- cbind(gost_res_9$result, pattern = c(rep("pattern_9")))
Human alluvial plot - Patterns 2, 5, and 9
#combine dataframes
gost_pattern259_all <- rbind(gost_pattern2, gost_pattern5, gost_pattern9)
#looking at just kegg results
gost_pattern259 <- dplyr::filter(gost_pattern259_all, source == "KEGG")
#gost_pattern259_table <- apply(gost_pattern259, 2, as.character)
#write.csv2(gost_pattern259_table, file = "gost_human_results_patterns259.csv")
Alluvial Plot - Mouse
Looking at all gost results and just KEGG/Reactome/Corum
Alluvial Plot - Mouse
Looking just at gost profiler KEGG results
Alluvial Plot - Mouse
Looking just at gost profiler GO results
installed.packages()[names(sessionInfo()$otherPkgs), "Version"]
## pvclust fpc org.Mm.eg.db
## "2.2-0" "2.2-9" "3.12.0"
## org.Hs.eg.db reshape2 viridis
## "3.12.0" "1.4.4" "0.6.1"
## viridisLite EnsDb.Hsapiens.v86 ensembldb
## "0.4.0" "2.99.0" "2.14.1"
## AnnotationFilter GenomicFeatures gprofiler2
## "1.14.0" "1.42.3" "0.2.0"
## ggrepel ggalluvial tidyr
## "0.9.1" "0.12.3" "1.1.3"
## biomaRt ggbiplot scales
## "2.47.9" "0.55" "1.1.1"
## plyr devtools usethis
## "1.8.6" "2.4.2" "2.0.1"
## AnnotationDbi tibble dplyr
## "1.52.0" "3.1.2" "1.0.7"
## gplots DESeq2 SummarizedExperiment
## "3.1.1" "1.30.1" "1.20.0"
## MatrixGenerics matrixStats GenomicRanges
## "1.2.1" "0.59.0" "1.42.0"
## GenomeInfoDb IRanges S4Vectors
## "1.26.7" "2.24.1" "0.28.1"
## ggplot2 CoGAPS projectR
## "3.3.5" "3.10.0" "1.6.0"
## Biobase BiocGenerics
## "2.50.0" "0.36.1"